library(tidyverse)
library(janitor)Olivia’s Cleaning
Goals of this notebook
The steps we’ll take to prepare our data:
- Download the data
- Import it into our notebook
- Clean up data types and columns
- Export the data for next notebook
Setup
Importing data
We’re importing the data which was collected in a PIR by KUT’s Lauren McGaughy.
spurs_raw <- read_csv("data-raw/spurs_raw_data.csv") |> clean_names()Rows: 50162 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (15): Arrest Date, Arrest County, Arrest Division, Arrest State, Arrest ...
dbl (2): Arrest ID, Person Age
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spurs_rawChanging name
spurs_renamed <- spurs_raw |>
rename(arrest_date_old = arrest_date,
charge = offense,
severity_code = level_group,
arrest_officer = arrested_by,
gender_code = person_sex,
statute = citation,
ethnicity_race = person_race
)
spurs_renamedChanging dates
spurs_date <- spurs_renamed |>
mutate(
charge_date = mdy_hm(arrest_date_old) |> date()
)Changing officer column
spurs_officer_cleaning <- spurs_date |>
mutate(new_officer = str_split_i(arrest_officer, " - ", 1), .after = arrest_officer,
id_officer = str_split_i(arrest_officer, " - ", 2)
)
spurs_officer_cleaningChanging ethnicity column
spurs_race <- spurs_officer_cleaning |>
mutate(
person_race_abbr = case_match(ethnicity_race, "White" ~ "W", "Hispanic" ~ "H", "Hispanic or Latino" ~ "H", "Black" ~ "B", "Asian" ~ "A"),
person_gender_abbr = case_match(gender_code, "M - Male" ~ "M", "F - Female" ~ "F")
)
spurs_raceChanging names to uppercase
spurs_uppercase <- spurs_race |>
mutate(person_first_name = str_to_upper(person_first_name),
person_last_name = str_to_upper(person_last_name))
spurs_uppercaseCreating a charge count
spurs_charge_count <- spurs_uppercase |>
group_by(arrest_date_old, arrest_county, arrest_division, arrest_id, arrest_state, arrest_urn, arrest_officer, new_officer, id_officer, statute, code, inv_number, severity_code, charge, person_first_name, person_last_name, ethnicity_race, gender_code, person_age, charge_date, person_race_abbr, person_gender_abbr ) |>
summarise(charge_count = n()) |>
mutate(charge_count = as.numeric(charge_count))`summarise()` has grouped output by 'arrest_date_old', 'arrest_county',
'arrest_division', 'arrest_id', 'arrest_state', 'arrest_urn', 'arrest_officer',
'new_officer', 'id_officer', 'statute', 'code', 'inv_number', 'severity_code',
'charge', 'person_first_name', 'person_last_name', 'ethnicity_race',
'gender_code', 'person_age', 'charge_date', 'person_race_abbr'. You can
override using the `.groups` argument.
spurs_charge_countSeverity code
spurs_code_clean <- spurs_charge_count |>
mutate(severity_code_clean = str_remove(severity_code, "SPURS "), .after = severity_code)
spurs_code_cleanSelecting which columns I actually need
spurs_clean <- spurs_code_clean |> ungroup() |>
select(
arrest_county,
arrest_division,
arrest_id,
arrest_state,
arrest_urn,
arresting_officer = new_officer,
spurs_officer_id = id_officer,
statute,
code,
inv_number,
severity_code_clean,
charge,
person_first_name,
person_last_name,
person_age,
charge_date,
person_race_abbr,
person_gender_abbr,
charge_count
)
spurs_clean |> glimpse()Rows: 41,379
Columns: 19
$ arrest_county <chr> "Aransas", "Bee", "Bee", "Cameron", "Cameron", "Ec…
$ arrest_division <chr> "Texas Highway Patrol", "Texas Highway Patrol", "T…
$ arrest_id <dbl> 209274810, 210182331, 223969513, 213772403, 213772…
$ arrest_state <chr> "TEXAS", "TEXAS", "TEXAS", "TEXAS", "TEXAS", "TEXA…
$ arrest_urn <chr> "CMAR00171811", "CMAR00172979", "CMAR00183927", "C…
$ arresting_officer <chr> "Kaitlynn King", "Luke William Kanz", "Justin K Ja…
$ spurs_officer_id <chr> "Kk26997", "Lk23525", "Jj25825", "Jc05699", "Jc056…
$ statute <chr> "46.02(A)", "481.121(B)(1)", "46.02(A)", "481.121(…
$ code <chr> "Penal Code", "Health And Safety Code", "Penal Cod…
$ inv_number <chr> "2022I-THP3-149480", "2022I-THP3-149608", "2022I-T…
$ severity_code_clean <chr> "Misdemeanor Charges", "Misdemeanor Charges", "Mis…
$ charge <chr> "Unl Carrying Weapon", "Possession Of Marihuana <2…
$ person_first_name <chr> "WESLIN", "JOEL", "JESUS", "SERGIO", "SERGIO", "KI…
$ person_last_name <chr> "LEMKE", "CASAS", "CADENA", "ARMENDAREZ-GARCA", "A…
$ person_age <dbl> 23, 27, 52, 23, 23, 29, 29, 45, 29, 17, 24, 24, 24…
$ charge_date <date> 2022-01-01, 2022-01-01, 2022-01-01, 2022-01-01, 2…
$ person_race_abbr <chr> "W", "H", "H", "H", "H", "H", "H", "H", "H", "H", …
$ person_gender_abbr <chr> "M", "M", "M", "M", "M", "F", "F", "M", "M", "M", …
$ charge_count <dbl> 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,…
Exporting to data_processed
spurs_clean |>
write_rds("data-processed/01-clean-spurs.rds")